Code (and analysis) from Ming Tang

(see “TCR_concordance.html” from 9/10/2021 stored in TCR_Concordance Dropbox)


library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.1     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
tcr1<- read_tsv("/Users/jen/Dropbox/TCR_Concordance/concord_data/E4412_1A_TIL_CONTROL_DNA.tsv", guess_max = 1000000)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   sku = col_logical(),
##   test_name = col_logical(),
##   sample_catalog_tags = col_logical(),
##   sample_rich_tags = col_logical(),
##   sample_rich_tags_json = col_logical(),
##   hla_class_i = col_logical(),
##   hla_class_ii = col_logical(),
##   kit_control = col_logical(),
##   total_templates = col_double(),
##   productive_templates = col_double(),
##   outofframe_templates = col_double(),
##   stop_templates = col_double(),
##   dj_templates = col_double(),
##   total_rearrangements = col_double(),
##   productive_rearrangements = col_double(),
##   outofframe_rearrangements = col_double(),
##   stop_rearrangements = col_double(),
##   dj_rearrangements = col_double(),
##   total_reads = col_logical(),
##   total_productive_reads = col_logical()
##   # ... with 51 more columns
## )
## ℹ Use `spec()` for the full column specifications.
tcr2<- read_tsv("/Users/jen/Dropbox/TCR_Concordance/concord_data/E4412_1B_TIL_CONTROL.tsv", guess_max = 1000000)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   sku = col_logical(),
##   test_name = col_logical(),
##   sample_catalog_tags = col_logical(),
##   sample_rich_tags = col_logical(),
##   sample_rich_tags_json = col_logical(),
##   hla_class_i = col_logical(),
##   hla_class_ii = col_logical(),
##   kit_control = col_logical(),
##   total_templates = col_double(),
##   productive_templates = col_double(),
##   outofframe_templates = col_double(),
##   stop_templates = col_double(),
##   dj_templates = col_double(),
##   total_rearrangements = col_double(),
##   productive_rearrangements = col_double(),
##   outofframe_rearrangements = col_double(),
##   stop_rearrangements = col_double(),
##   dj_rearrangements = col_double(),
##   total_reads = col_logical(),
##   total_productive_reads = col_logical()
##   # ... with 50 more columns
## )
## ℹ Use `spec()` for the full column specifications.
dim(tcr1)
## [1] 1652  118
dim(tcr2)
## [1] 1936  118
colnames(tcr1)
##   [1] "sample_name"                               
##   [2] "species"                                   
##   [3] "locus"                                     
##   [4] "product_subtype"                           
##   [5] "kit_pool"                                  
##   [6] "sku"                                       
##   [7] "test_name"                                 
##   [8] "sample_catalog_tags"                       
##   [9] "sample_rich_tags"                          
##  [10] "sample_rich_tags_json"                     
##  [11] "hla_class_i"                               
##  [12] "hla_class_ii"                              
##  [13] "kit_control"                               
##  [14] "total_templates"                           
##  [15] "productive_templates"                      
##  [16] "outofframe_templates"                      
##  [17] "stop_templates"                            
##  [18] "dj_templates"                              
##  [19] "total_rearrangements"                      
##  [20] "productive_rearrangements"                 
##  [21] "outofframe_rearrangements"                 
##  [22] "stop_rearrangements"                       
##  [23] "dj_rearrangements"                         
##  [24] "total_reads"                               
##  [25] "total_productive_reads"                    
##  [26] "total_outofframe_reads"                    
##  [27] "total_stop_reads"                          
##  [28] "total_dj_reads"                            
##  [29] "productive_simpson_clonality"              
##  [30] "productive_clonality"                      
##  [31] "productive_entropy"                        
##  [32] "sample_simpson_clonality"                  
##  [33] "sample_clonality"                          
##  [34] "sample_entropy"                            
##  [35] "sample_amount_ng"                          
##  [36] "sample_cells_mass_estimate"                
##  [37] "fraction_productive_of_cells_mass_estimate"
##  [38] "sample_cells"                              
##  [39] "fraction_productive_of_cells"              
##  [40] "max_productive_frequency"                  
##  [41] "max_frequency"                             
##  [42] "counting_method"                           
##  [43] "primer_set"                                
##  [44] "sequence_result_status"                    
##  [45] "release_date"                              
##  [46] "upload_date"                               
##  [47] "sample_tags"                               
##  [48] "fraction_productive"                       
##  [49] "order_name"                                
##  [50] "kit_id"                                    
##  [51] "total_t_cells"                             
##  [52] "total_templates_agg"                       
##  [53] "rearrangement"                             
##  [54] "amino_acid"                                
##  [55] "frame_type"                                
##  [56] "rearrangement_type"                        
##  [57] "templates"                                 
##  [58] "seq_reads"                                 
##  [59] "frequency"                                 
##  [60] "productive_frequency"                      
##  [61] "cdr3_length"                               
##  [62] "v_family"                                  
##  [63] "v_gene"                                    
##  [64] "v_allele"                                  
##  [65] "d_family"                                  
##  [66] "d_gene"                                    
##  [67] "d_allele"                                  
##  [68] "j_family"                                  
##  [69] "j_gene"                                    
##  [70] "j_allele"                                  
##  [71] "v_deletions"                               
##  [72] "d5_deletions"                              
##  [73] "d3_deletions"                              
##  [74] "j_deletions"                               
##  [75] "n2_insertions"                             
##  [76] "n1_insertions"                             
##  [77] "v_index"                                   
##  [78] "n1_index"                                  
##  [79] "n2_index"                                  
##  [80] "d_index"                                   
##  [81] "j_index"                                   
##  [82] "v_family_ties"                             
##  [83] "v_gene_ties"                               
##  [84] "v_allele_ties"                             
##  [85] "d_family_ties"                             
##  [86] "d_gene_ties"                               
##  [87] "d_allele_ties"                             
##  [88] "j_family_ties"                             
##  [89] "j_gene_ties"                               
##  [90] "j_allele_ties"                             
##  [91] "sequence_tags"                             
##  [92] "v_shm_count"                               
##  [93] "v_shm_indexes"                             
##  [94] "antibody"                                  
##  [95] "bio_identity"                              
##  [96] "rearrangement_trunc"                       
##  [97] "v_resolved"                                
##  [98] "d_resolved"                                
##  [99] "j_resolved"                                
## [100] "extended_rearrangement"                    
## [101] "cdr1_rearrangement"                        
## [102] "cdr1_amino_acid"                           
## [103] "cdr1_start_index"                          
## [104] "cdr1_rearrangement_length"                 
## [105] "cdr2_rearrangement"                        
## [106] "cdr2_amino_acid"                           
## [107] "cdr2_start_index"                          
## [108] "cdr2_rearrangement_length"                 
## [109] "cdr3_rearrangement"                        
## [110] "cdr3_amino_acid"                           
## [111] "cdr3_start_index"                          
## [112] "cdr3_rearrangement_length"                 
## [113] "chosen_v_family"                           
## [114] "chosen_v_gene"                             
## [115] "chosen_v_allele"                           
## [116] "chosen_j_family"                           
## [117] "chosen_j_gene"                             
## [118] "chosen_j_allele"
tcr1<- tcr1 %>%
  select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
  filter(!is.na(productive_frequency)) %>%
  mutate(sample = "1A")

tcr2<- tcr2 %>%
  select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
  filter(!is.na(productive_frequency)) %>%
  mutate(sample = "1B")

dim(tcr1)
## [1] 1282    6
dim(tcr2)
## [1] 1361    6
tcr1_2<- inner_join(tcr1, tcr2, by=c("v_gene" = "v_gene", "j_gene" ="j_gene", "cdr3_amino_acid"= "cdr3_amino_acid"))

ggplot(tcr1_2, aes(x=productive_frequency.x, y= productive_frequency.y)) +
  geom_point()+
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
  theme_classic(base_size = 14) +
    ggtitle("correlation of productive frequency of all overlapping clones")

cor(tcr1_2$productive_frequency.x, tcr1_2$productive_frequency.y)
## [1] 0.9991812
## check the productive frequecy distribution. most of them are below 0.1%
ggplot(tcr1, aes(x= productive_frequency))+
  geom_histogram(bins=100) +
  scale_x_log10() +
  geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
  theme_classic(base_size = 14)

## 99% of the clones frequency < 0.01
mean(tcr1$productive_frequency <0.01)
## [1] 0.9914197
## 82 clones have frequency > 0.001, so, checking top 50 or top 100 clones makes sense.
sum(tcr1$productive_frequency > 0.001)
## [1] 82
## 76 clones
sum(tcr2$productive_frequency > 0.001)
## [1] 76
### Check the correlation for the top 50 clones

## there are ties, so 53 clones were selected 
tcr1_top50<- tcr1 %>%
  slice_max(order_by = productive_frequency, n = 50, with_ties = TRUE)

tcr1_2_top50<- inner_join(tcr1_top50, tcr2, by=c("v_gene" = "v_gene", "j_gene" ="j_gene", "cdr3_amino_acid"= "cdr3_amino_acid"))

## 53 clones appear in both 1A and 1B samples!
dim(tcr1_2_top50)
## [1] 53  9
ggplot(tcr1_2_top50, aes(x=productive_frequency.x, y= productive_frequency.y)) +
  geom_point()+
  scale_x_log10() +
  scale_y_log10() +
  geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
  theme_classic(base_size = 14) +
  ggtitle("correlation of productive frequency of top 50 clones")

cor(tcr1_2_top50$productive_frequency.x, tcr1_2_top50$productive_frequency.y)
## [1] 0.9991644

New Code (Attempt to make loop for all files in a directory)

library(tidyverse)
library(dplyr)
library(ggplot2)

#set input directory
indir<-"/Users/jen/Dropbox/TCR_Concordance/concord_data"
outdir<-"/Users/jen/Dropbox/TCR_Concordance/concord_output/2021_09_14/"

#create read table function that can accept a variable as a table name
read_tcr<-function(k){
  tcr_table<-read.table(k,header=T,sep="\t")
  tcr_table<-tcr_table %>%
    select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
    filter(!is.na(productive_frequency))
  return(tcr_table)
}

# get list of filenames from input directory
ff <- list.files(path=indir, full.names=TRUE)

# apply read_tcr function to list of files, add file names to each list member
myfilelist <- lapply(ff, read_tcr)
names(myfilelist)<-list.files(path=indir, full.names=FALSE)

#create separate file list with top 50
read_tcr_top<-function(s){
  tcr_table_50<-read.table(s,header=T,sep="\t")
  tcr_table_50<-tcr_table_50 %>%
    select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
    filter(!is.na(productive_frequency)) %>%
    slice_max(order_by = productive_frequency, n = 50, with_ties = TRUE)
  return(tcr_table_50)
}

#create separate file list with top 100
read_tcr_top100<-function(s){
  tcr_table_100<-read.table(s,header=T,sep="\t")
  tcr_table_100<-tcr_table_100 %>%
    select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
    filter(!is.na(productive_frequency)) %>%
    slice_max(order_by = productive_frequency, n = 100, with_ties = TRUE)
  return(tcr_table_100)
}

#create separate file list with top 500
read_tcr_top500<-function(s){
  tcr_table_500<-read.table(s,header=T,sep="\t")
  tcr_table_500<-tcr_table_500 %>%
    select(productive_frequency, cdr3_amino_acid, v_gene, d_gene, j_gene) %>%
    filter(!is.na(productive_frequency)) %>%
    slice_max(order_by = productive_frequency, n = 500, with_ties = TRUE)
  return(tcr_table_500)
}

myfilelist_top50<- lapply(ff,read_tcr_top)
names(myfilelist_top50)<-list.files(path=indir, full.names=FALSE)

myfilelist_top100<-lapply(ff,read_tcr_top100)
names(myfilelist_top100)<-list.files(path=indir, full.names=FALSE)

myfilelist_top500<-lapply(ff,read_tcr_top500)
names(myfilelist_top500)<-list.files(path=indir, full.names=FALSE)
########################################
##### Productive Frequency Ranges ######
########################################

NameList<-list.files(path=indir, full.names=FALSE)

# get ranges of productive frequencies
prod_freq_matrix<-matrix(nrow=length(myfilelist),ncol=12)
rownames(prod_freq_matrix)<-NameList
colnames(prod_freq_matrix)<-c("all_clones_min","all_clone_max","all_clones_mean","top500_min","top500_max","top500_mean","top100_min","top100_max","top100_mean", "top50_min","top50_max","top50_mean")

## range for all clones
for (i in 1:length(myfilelist)){
  prod_freq_matrix[i,1:3]<-c(range(myfilelist[[i]]$productive_frequency),mean(myfilelist[[i]]$productive_frequency))
  prod_freq_matrix[i,4:6]<-c(range(myfilelist_top500[[i]]$productive_frequency),mean(myfilelist_top500[[i]]$productive_frequency))
  prod_freq_matrix[i,7:9]<-c(range(myfilelist_top100[[i]]$productive_frequency),mean(myfilelist_top100[[i]]$productive_frequency))
  prod_freq_matrix[i,10:12]<-c(range(myfilelist_top50[[i]]$productive_frequency),mean(myfilelist_top50[[i]]$productive_frequency))
}

# print productive frequency summary table
prod_freq_matrix %>% 
  DT::datatable()

Check Analysis using “d_gene” in addition to other matches.

In the initial analysis, it seemed as if including the d_gene in the inner_join resulted in a lot of lost matches. Double checking…

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
#create inner_join function that include d gene
merge_tcrs_withd<-function(dd1, dd2){
  merged_files_d<-inner_join(dd1, dd2, by=c("v_gene" = "v_gene", "j_gene" ="j_gene", "cdr3_amino_acid"= "cdr3_amino_acid","d_gene"="d_gene"))
  return(merged_files_d)
}

#make empty lists for plots and correlations
myplots_withd<-list()
mycorrs_withd<-list()
mydims_withd<-list()

#run loop such that i=top 50 clone tsvs and h=full clone list
#e.g. myplots[[1]][[2]]= a comparison of the top50 clones of file 1 to
# all clones in file 2

for (i in 1:length(myfilelist)){
  #create plot and correlation list for file i within myplot_list
  myplots_withd[[i]]<-list()
  mycorrs_withd[[i]]<-list()
  mydims_withd[[i]]<-list()
  
  for (h in 1:length(myfilelist)){
    if (i==h) next
    tcr_i_h_withd<-merge_tcrs_withd(myfilelist_top50[[i]],myfilelist[[h]])
    
    p_withd<-ggplot(tcr_i_h_withd, aes(x=productive_frequency.x, y= productive_frequency.y)) +
      geom_point()+
      scale_x_log10() +
      scale_y_log10() +
      geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
      theme_classic(base_size = 14) +
      ggtitle("correlation of productive frequency of top 50 clones with d genes")
  
    correlation_withd<-cor(tcr_i_h_withd$productive_frequency.x, tcr_i_h_withd$productive_frequency.y)
    dimension50_withd<-dim(tcr_i_h_withd)
    
    myplots_withd[[i]][[h]]<-p_withd
    mycorrs_withd[[i]][[h]]<-correlation_withd
    mydims_withd[[i]][[h]]<-dimension50_withd
    
  }
}

names(myplots_withd)<-list.files(path=indir, full.names=FALSE)
names(mydims_withd)<-list.files(path=indir, full.names=FALSE)

# function to display all correlations and plots.
# will also write to file
disp_plots_corr_withd<-function(file1,file2){
  cat(paste("Top 50 clones of ",NameList[file1],"compared to clones of ",NameList[file2],"with d genes","\n"))
  cat(paste("Pearson Correlation = ",mycorrs_withd[[file1]][[file2]]))
  print(myplots_withd[[file1]][[file2]])
  #print plots to file
  png_file<-paste(outdir,"top50",NameList[file1],"_",NameList[file2],"withD.png",sep="")
  png(file=png_file,width=600, height=350)
  print(myplots_withd[[file1]][[file2]])
  dev.off()
}


for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h) next
    disp_plots_corr_withd(i,h)
  }
}
## Top 50 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999166262046191

## Top 50 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.99888032257963

## Top 50 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998998416207627

## Top 50 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.997190953292622

## Top 50 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.996244044548813

## Top 50 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.999159265145344

## Top 50 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.999164435857467

## Top 50 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.999261819151957

## Top 50 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.99671984709148

## Top 50 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.997875729695303

## Top 50 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.998879745493858

## Top 50 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999168240179539

## Top 50 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.999406679771663

## Top 50 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.997272809748641

## Top 50 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.997806117133848

## Top 50 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.999015320734871

## Top 50 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999293924043519

## Top 50 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.999447523996795

## Top 50 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.998375106963167

## Top 50 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998260342740857

## Top 50 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.997215372056811

## Top 50 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.99677527545078

## Top 50 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.997295454351495

## Top 50 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998410838846518

## Top 50 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.996442387751591

## Top 50 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.996315685425047

## Top 50 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.997973590995711

## Top 50 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.997916899185321

## Top 50 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998363528949173

## Top 50 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.996453863578923

#make matrix from mycorrs such that rows = top clones data and cols = all clones datas
mycorrs_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h){
      mycorrs_mx_withd[i,h]<-NA
  }else{
      mycorrs_mx_withd[i,h]<-mycorrs_withd[[i]][[h]]}
  }
}


rownames(mycorrs_mx_withd)<-NameList
colnames(mycorrs_mx_withd)<-NameList

#prepare correlation matrix for heatmap
melted_corr_withd <- melt(mycorrs_mx_withd)

#plot correlation heatmap
ggplot(data = melted_corr_withd, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  ggtitle("Pearson correlations for top 50 clones, d genes included") + 
  xlab("") +
  ylab("")

# Heatmap not very informative, maybe print table?
mycorrs_mx_withd %>% 
  DT::datatable()
##############################################
### Repeat analysis with top 100 clones  #####
##############################################

#make empty lists for plots and correlations
myplots100_withd<-list()
mycorrs100_withd<-list()
mydims100_withd<-list()
#run loop such that i=top 100 clone tsvs and h=full clone list
#e.g. myplots[[1]][[2]]= a comparison of the top100 clones of file 1 to
# all clones in file 2

for (i in 1:length(myfilelist)){
  #create plot and correlation list for file i within myplot_list
  myplots100_withd[[i]]<-list()
  mycorrs100_withd[[i]]<-list()
  mydims100_withd[[i]]<-list()
  
  for (h in 1:length(myfilelist)){
    if (i==h) next
    tcr100_i_h_withd<-merge_tcrs_withd(myfilelist_top100[[i]],myfilelist[[h]])
    
    p100_withd<-ggplot(tcr100_i_h_withd, aes(x=productive_frequency.x, y= productive_frequency.y)) +
      geom_point()+
      scale_x_log10() +
      scale_y_log10() +
      geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
      theme_classic(base_size = 14) +
      ggtitle("correlation of productive frequency of top 100 clones with d genes")
  
    correlation100_withd<-cor(tcr100_i_h_withd$productive_frequency.x, tcr100_i_h_withd$productive_frequency.y)
    dimensions100_withd<-dim(tcr100_i_h_withd)
    
    myplots100_withd[[i]][[h]]<-p100_withd
    mycorrs100_withd[[i]][[h]]<-correlation100_withd
    mydims100_withd[[i]][[h]]<-dimensions100_withd
    
  }
}

names(myplots100_withd)<-list.files(path=indir, full.names=FALSE)
names(mydims100_withd)<-list.files(path=indir, full.names=FALSE)

# function to display all correlations and plots for top100.
# will also write to file
disp_plots_corr100_withd<-function(file1,file2){
  cat(paste("Top 100 clones of ",NameList[file1],"compared to clones of ",NameList[file2],"with d genes","\n"))
  cat(paste("Pearson Correlation = ",mycorrs100_withd[[file1]][[file2]]))
  print(myplots100_withd[[file1]][[file2]])
  #print plots to file
  png_file<-paste(outdir,"top100",NameList[file1],"_",NameList[file2],"withD.png",sep="")
  png(file=png_file,width=600, height=350)
  print(myplots100_withd[[file1]][[file2]])
  dev.off()
}


for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h) next
    disp_plots_corr100_withd(i,h)
  }
}
## Top 100 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999178075235607

## Top 100 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.998916855799233

## Top 100 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998988330114623

## Top 100 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.997163144858478

## Top 100 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.996412284959479

## Top 100 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.999164027698048

## Top 100 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.999176761532961

## Top 100 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.999179611958061

## Top 100 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.996639031611959

## Top 100 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.997941597424477

## Top 100 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.99891571360525

## Top 100 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999182005603366

## Top 100 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.999372729041565

## Top 100 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.997237233718641

## Top 100 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.99789992542615

## Top 100 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.999003488597187

## Top 100 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999225057234736

## Top 100 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.999402859140985

## Top 100 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.998417367243602

## Top 100 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998263313735027

## Top 100 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.997182377273628

## Top 100 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.996685550370382

## Top 100 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.997263526038884

## Top 100 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998425379632029

## Top 100 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.996413523293684

## Top 100 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.996474185922898

## Top 100 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.998047600035829

## Top 100 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.997991239488617

## Top 100 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998373582175545

## Top 100 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.996444174142193

#make matrix from mycorrs such that rows = top clones data and cols = all clones datas
mycorrs100_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h){
      mycorrs100_mx_withd[i,h]<-NA
  }else{
      mycorrs100_mx_withd[i,h]<-mycorrs100_withd[[i]][[h]]}
  }
}


rownames(mycorrs100_mx_withd)<-NameList
colnames(mycorrs100_mx_withd)<-NameList

#prepare correlation matrix for heatmap
melted_corr100_withd <- melt(mycorrs100_mx_withd)

#plot correlation heatmap
ggplot(data = melted_corr100_withd, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  ggtitle("Pearson correlations for top 100 clones with d genes") + 
  xlab("") +
  ylab("")

# Heatmap not very informative, maybe print table?
mycorrs100_mx_withd %>% 
  DT::datatable()
##############################################
### Repeat analysis with top 500 clones  #####
##############################################

#make empty lists for plots and correlations
myplots500_withd<-list()
mycorrs500_withd<-list()
mydims500_withd<-list()

#run loop such that i=top 500 clone tsvs and h=full clone list
#e.g. myplots[[1]][[2]]= a comparison of the top500 clones of file 1 to
# all clones in file 2

for (i in 1:length(myfilelist)){
  #create plot and correlation list for file i within myplot_list
  myplots500_withd[[i]]<-list()
  mycorrs500_withd[[i]]<-list()
  mydims500_withd[[i]]<-list()
  
  for (h in 1:length(myfilelist)){
    if (i==h) next
    tcr500_i_h_withd<-merge_tcrs_withd(myfilelist_top500[[i]],myfilelist[[h]])
    
    p500_withd<-ggplot(tcr500_i_h_withd, aes(x=productive_frequency.x, y= productive_frequency.y)) +
      geom_point()+
      scale_x_log10() +
      scale_y_log10() +
      geom_abline(slope = 1, intercept = 0, linetype=2, color = "red") +
      theme_classic(base_size = 14) +
      ggtitle("correlation of productive frequency of top 500 clones with d genes")
  
    correlation500_withd<-cor(tcr500_i_h_withd$productive_frequency.x, tcr500_i_h_withd$productive_frequency.y)
    dimensions500_withd<-dim(tcr500_i_h_withd)
    
    myplots500_withd[[i]][[h]]<-p500_withd
    mycorrs500_withd[[i]][[h]]<-correlation500_withd
    mydims500_withd[[i]][[h]]<-dimensions500_withd
    
  }
}

names(myplots500_withd)<-list.files(path=indir, full.names=FALSE)
names(mydims500_withd)<-list.files(path=indir, full.names=FALSE)

# function to display all correlations and plots for top100.
# will also write to file
disp_plots_corr500_withd<-function(file1,file2){
  cat(paste("Top 500 clones of ",NameList[file1],"compared to clones of ",NameList[file2],"with d genes","\n"))
  cat(paste("Pearson Correlation = ",mycorrs500_withd[[file1]][[file2]]))
  print(myplots500_withd[[file1]][[file2]])
  #print plots to file
  png_file<-paste(outdir,"top500",NameList[file1],"_",NameList[file2],"withD.png",sep="")
  png(file=png_file,width=600, height=350)
  print(myplots500_withd[[file1]][[file2]])
  dev.off()
}


for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h) next
    disp_plots_corr500_withd(i,h)
  }
}
## Top 500 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999186870841341

## Top 500 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.99894784452698

## Top 500 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998980181330736

## Top 500 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.997173541144413

## Top 500 clones of  DFCI_20210908_DNAreference.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.996551409195004

## Top 500 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.99917485861993

## Top 500 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.999181521111352

## Top 500 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.999120478472191

## Top 500 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.996615386026278

## Top 500 clones of  E4412_1A_TIL_CONTROL_DNA.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998003971636869

## Top 500 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.998941719464135

## Top 500 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999185288329041

## Top 500 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.999330959505323

## Top 500 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.997221595028865

## Top 500 clones of  E4412_1B_TIL_CONTROL.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.997960625495892

## Top 500 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.998992763082061

## Top 500 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.999150147626278

## Top 500 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.99935016017218

## Top 500 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.998451306022531

## Top 500 clones of  E4412_2A_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998245210457028

## Top 500 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.997184626548416

## Top 500 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.996633877534219

## Top 500 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.997239336763174

## Top 500 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.99845148024227

## Top 500 clones of  E4412_2B_CONTROLDNA2.tsv compared to clones of  E4412_3_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.996392082408524

## Top 500 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  DFCI_20210908_DNAreference.tsv with d genes 
## Pearson Correlation =  0.996614269322668

## Top 500 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1A_TIL_CONTROL_DNA.tsv with d genes 
## Pearson Correlation =  0.998103238434014

## Top 500 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_1B_TIL_CONTROL.tsv with d genes 
## Pearson Correlation =  0.998060457007807

## Top 500 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2A_TIL_CONTROL_DNA2.tsv with d genes 
## Pearson Correlation =  0.998373605841485

## Top 500 clones of  E4412_3_TIL_CONTROL_DNA2.tsv compared to clones of  E4412_2B_CONTROLDNA2.tsv with d genes 
## Pearson Correlation =  0.996468410215163

#make matrix from mycorrs such that rows = top clones data and cols = all clones datas
mycorrs500_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h){
      mycorrs500_mx_withd[i,h]<-NA
  }else{
      mycorrs500_mx_withd[i,h]<-mycorrs500_withd[[i]][[h]]}
  }
}


rownames(mycorrs500_mx_withd)<-NameList
colnames(mycorrs500_mx_withd)<-NameList

#prepare correlation matrix for heatmap
melted_corr500_withd <- melt(mycorrs500_mx_withd)

#plot correlation heatmap
ggplot(data = melted_corr500_withd, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  ggtitle("Pearson correlations for top 500 clones with d genes") + 
  xlab("") +
  ylab("")

# Heatmap not very informative, maybe print table?
mycorrs500_mx_withd %>% 
  DT::datatable()
######### print dimensions to evaluate if some clones are being lost with the inner join

#make matrix from mydims such that rows = top clones data and cols = all clones datas
# for top50 with d genes
mydims_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h){
      mydims_mx_withd[i,h]<-NA
  }else{
      mydims_mx_withd[i,h]<-mydims_withd[[i]][[h]][[1]]}
  }
}

rownames(mydims_mx_withd)<-NameList
colnames(mydims_mx_withd)<-NameList

#print matrix
print("Total number of clones after merge.  Rows = Top 50 clone file.")
## [1] "Total number of clones after merge.  Rows = Top 50 clone file."
mydims_mx_withd %>% 
  DT::datatable()
# for top100 with d genes
mydims100_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h){
      mydims100_mx_withd[i,h]<-NA
  }else{
      mydims100_mx_withd[i,h]<-mydims100_withd[[i]][[h]][[1]]}
  }
}

rownames(mydims100_mx_withd)<-NameList
colnames(mydims100_mx_withd)<-NameList

#print matrix
print("Total number of clones after merge.  Rows = Top 100 clone file.")
## [1] "Total number of clones after merge.  Rows = Top 100 clone file."
mydims100_mx_withd %>% 
  DT::datatable()
# for top500 with d genes
mydims500_mx_withd<-matrix(nrow=length(myfilelist),ncol=length(myfilelist))
for (i in 1:length(myfilelist)){
  for (h in 1:length(myfilelist)){
    if (i==h){
      mydims500_mx_withd[i,h]<-NA
  }else{
      mydims500_mx_withd[i,h]<-mydims500_withd[[i]][[h]][[1]]}
  }
}


rownames(mydims500_mx_withd)<-NameList
colnames(mydims500_mx_withd)<-NameList

#print matrix
print("Total number of clones after merge.  Rows = Top 500 clone file.")
## [1] "Total number of clones after merge.  Rows = Top 500 clone file."
mydims500_mx_withd %>% 
  DT::datatable()

Including D genes does not seem significantly affect the results. Marking earlier code as eval=FALSE. Will knit with just with d genes code evaluated.

 #calculate mean productive frequencies for all files
top50_clone_freq_mean<-mean(prod_freq_matrix[,"top50_mean"])
top100_clone_freq_mean<-mean(prod_freq_matrix[,"top100_mean"])
top500_clone_freq_mean<-mean(prod_freq_matrix[,"top500_mean"])
all_clone_freq_mean<-mean(prod_freq_matrix[,"all_clones_mean"])

# calculate fraction of clones with freq <0.001
top50_freq_one_tenth<-sum(myfilelist_top50[[1]]$productive_frequency <0.001)/sum(myfilelist_top50[[1]]$productive_frequency>0)
top100_freq_one_tenth<-sum(myfilelist_top100[[1]]$productive_frequency <0.001)/sum(myfilelist_top100[[1]]$productive_frequency>0)
top500_freq_one_tenth<-sum(myfilelist_top500[[1]]$productive_frequency <0.001)/sum(myfilelist_top500[[1]]$productive_frequency>0)
all_freq_one_tenth<-sum(myfilelist[[1]]$productive_frequency <0.001)/sum(myfilelist[[1]]$productive_frequency>0)

# calculate fraction of clones with freq <0.01
top50_freq_one<-sum(myfilelist_top50[[1]]$productive_frequency<0.01)/sum(myfilelist_top50[[1]]$productive_frequency>0)
top100_freq_one<-sum(myfilelist_top100[[1]]$productive_frequency<0.01)/sum(myfilelist_top100[[1]]$productive_frequency>0)
top500_freq_one<-sum(myfilelist_top500[[1]]$productive_frequency<0.01)/sum(myfilelist_top500[[1]]$productive_frequency>0)
all_freq_one<-sum(myfilelist[[1]]$productive_frequency<0.01)/sum(myfilelist[[1]]$productive_frequency>0)
# make table of original file lengths

total_clones<-matrix(nrow=length(myfilelist),ncol=1)
for (i in 1:length(myfilelist)){
  total_clones[i,1]<-dim(myfilelist[[i]])[[1]]
}

rownames(total_clones)<- NameList
colnames(total_clones)<-c("Total Clones")

#print matrix
print("Total clones in each file")
## [1] "Total clones in each file"
total_clones %>% 
  DT::datatable()

Sept 16,2021 # Make a histogram of the productive frequencies of each sample

#create empty data frame
prod_freq_df<-data.frame()


# add productive frequencies to the dataframe
for (i in 1:length(myfilelist)){
  #make a dataframe from the ith productive frequency numbers
  temp_df<-data.frame(productive_frequency=myfilelist[[i]]$productive_frequency)
  #add info about where data came from
  temp_df$fileName <- names(myfilelist[i])
  # merge dataframes to make one large dataframe
  prod_freq_df<- rbind(prod_freq_df,temp_df)
}

ggplot(prod_freq_df, aes(productive_frequency, fill = fileName)) + 
   geom_histogram(alpha = 0.5, bins=100, position = 'identity') +
   scale_x_log10() +
   geom_abline(slope = 1, intercept = 0, linetype=2) +
   theme_classic(base_size = 14) 

ggplot(prod_freq_df, aes(productive_frequency, fill = fileName)) + 
   geom_histogram(alpha = 0.5, bins=100) +
   scale_x_log10() +
   geom_abline(slope = 1, intercept = 0, linetype=2) +
   theme_classic(base_size = 14)